import sys

# Make the project root importable exactly once.
# Bug fix: the original used a conditional expression ending in a bare `_`
# (`... if ... else _;`), which raises NameError outside a live REPL
# whenever '..' is already on sys.path. A plain guard is the correct form.
if '..' not in sys.path:
    sys.path.insert(0, '..')
%load_ext autoreload
%autoreload 2
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import os
from itertools import product
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from sklearn.preprocessing import PowerTransformer, StandardScaler
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.model_selection import cross_val_score,TimeSeriesSplit
from xgboost import XGBRegressor
from sklearn.linear_model import LassoCV, RidgeCV
from statsmodels.tsa.arima_model import ARIMA
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.stattools import adfuller, pacf
from statsmodels.stats.diagnostic import kstest_normal
import statsmodels.api as sm
from sklearn.metrics import r2_score, median_absolute_error, mean_absolute_error
from sklearn.metrics import median_absolute_error,\
mean_squared_error, mean_squared_log_error
import Helpers.TS as ts
import tradingeconomics as te
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
from secret import enable_te
# Local CSV cache of the US GDP YoY series; read before hitting the TE API.
data_file = os.path.join('..','Data','USD_GDP_YOY.csv')
def getchFromTE(initData = '1960-01-01'):
    """
    Log into TradingEconomics and pull the US GDP growth-rate (YoY) series.

    Returns a one-column ('gdp') DataFrame with a quarter-end (Q-Dec) index.
    """
    # NOTE(review): the name looks like a typo for fetchFromTE; kept as-is
    # because fetchGDP calls it by this name.
    enable_te()
    frame = te.getHistoricalData(
        country='United States',
        indicator='GDP Growth Rate',
        initDate=initData,
    )
    frame.columns = ['gdp']
    frame.index.freq = 'Q-Dec'
    return frame
def fetchGDP(datafile=None):
    """
    Load the GDP YoY series from the local CSV cache, fetching it from
    TradingEconomics only when the cache file does not exist.

    Parameters
    ----------
    datafile : str, optional
        Path to a CSV with 'date' and 'gdp' columns. Defaults to the
        module-level `data_file` path (resolved at call time).

    Returns
    -------
    pd.DataFrame
        Single 'gdp' column indexed by quarterly dates (freq 'Q').
    """
    if datafile is None:
        # Resolved lazily so the function also works when imported standalone.
        datafile = data_file
    if os.path.isfile(datafile):
        # Bug fix: the original read the hard-coded relative path
        # '../Data/USD_GDP_YOY.csv' here, silently ignoring `datafile`.
        gdp = pd.read_csv(datafile, parse_dates=['date']).set_index('date')
    else:
        gdp = getchFromTE()
    gdp.index.freq = 'Q'
    return gdp
# Load the series and plot it with 10th/90th percentile reference lines.
df = fetchGDP(data_file)
df.head()
df.plot(figsize = [12,7],title = 'USD GDP YoY')
# Bug fix: np.percentile returns values in the order of q=[10, 90], so the
# original `upper, lower = ...` assigned the 10th percentile to `upper` and
# the 90th to `lower`, drawing each reference line with the wrong label.
lower, upper = np.percentile(df.gdp, q=[10, 90])
plt.axhline(upper, linestyle='--', color='purple', label='90th percentile')
plt.axhline(lower, linestyle='--', color='red', label='10th percentile')
plt.yticks(ticks=np.arange(start=-12, stop=20, step=2))
plt.ylabel('US GDP YoY')
plt.legend()
sns.despine()
# Histogram/KDE of the distribution with +/- 1 standard-deviation markers.
plt.figure(figsize=[12,7])
# Bug fix: positional `Series[0]` indexing is deprecated and removed in
# modern pandas; .iloc[0] takes the scalar explicitly.
mean_gdp, std_gdp = (df.mean().iloc[0], df.std().iloc[0])
# NOTE(review): sns.distplot is deprecated in favor of histplot/displot;
# kept because the replacements style the plot differently.
sns.distplot(df.gdp, rug=True, bins=30, hist_kws={'fill': False})
plt.axvline(x=mean_gdp + std_gdp, label='Mean + SD', linestyle='--', color='green')
plt.axvline(x=mean_gdp - std_gdp, label='Mean - SD', linestyle='--', color='red')
plt.title('\nUSD GDP YoY Distribution\n\n', fontsize=20)
plt.text(x=mean_gdp + std_gdp + 0.05, y=0.18,
         s="GDP AT 1 STD {0:.00f}%".format(mean_gdp + std_gdp),
         fontsize=14, color='green')
plt.text(x=mean_gdp - std_gdp - 8, y=0.18,
         s="GDP AT 1 STD {0:.00f}%".format(mean_gdp - std_gdp), fontsize=14,
         horizontalalignment='left', color='red')
plt.legend()
plt.xticks(ticks=np.arange(start=-12, stop=20, step=2))
sns.despine()
mean_gdp, std_gdp
# Q-Q plot against a fitted line, then a formal Kolmogorov-Smirnov test
# for normality at the 5% level.
values = df.values.squeeze()
fig = sm.qqplot(values, line='r')
fig.set_size_inches(10, 6)
sns.despine()
stat, pval = kstest_normal(values)
print('The p-val is {}'.format(pval))
s = 'The distribution is Not Normal ' if pval <= 0.05 else 'The distribution is normal'
print(s)
# Rolling-mean diagnostics at three window sizes (in quarters), each with
# 1.96-sigma confidence bands and anomaly highlighting.
ts.plotMovingAverage(df, window = 12, plot_intervals=True,\
scale=1.96, plot_anomalies=True,\
figsize = [15,7])
ts.plotMovingAverage(df, window = 24, plot_intervals=True,\
scale=1.96, plot_anomalies=True,\
figsize = [15,7])
ts.plotMovingAverage(df, window = 8, plot_intervals=True,\
scale=1.96, plot_anomalies=True,\
figsize = [15,7])
# Overlay two moving-average windows for direct comparison.
ts.compareMAs(df,window_1 = 8, window_2 = 4)
# Rolling standard deviation over a 24-quarter window.
ts.plotMovingSDs(df,24)
# Classical decomposition into trend / seasonal / residual components.
results = seasonal_decompose(df,extrapolate_trend=1)
fig = results.plot()
fig.set_size_inches(10,6)
sns.despine()
# ACF/PACF diagnostics: first of the decomposition residuals, then of
# the raw series.
ts.tsplot(results.resid.values.squeeze(), lags=30)
ts.tsplot(df.values.squeeze(), lags=60)
# Partial autocorrelations with 95% confidence intervals; report which
# lags exceed 0.10 in absolute value.
pacfs, confint = pacf(df.values.squeeze(),nlags = 60, alpha = 0.05)
np.where(np.abs(pacfs) > 0.10)
# NOTE: It is surprising that the initial series itself is stationary — the Dickey-Fuller test rejected the null hypothesis that a unit root is present. We did get a hint of this from the moving averages and moving standard deviations. Interestingly, the PACF shows long-range autocorrelations, hinting that seasonality may be present.
# SARIMA order search space.
ps = range(1,3)  # candidate non-seasonal AR orders p
d = 1  # non-seasonal differencing order
qs = range(1,3)  # candidate non-seasonal MA orders q
Ps = range(0,3)  # candidate seasonal AR orders P
D = 1  # seasonal differencing order
Qs = range(0,3)  # candidate seasonal MA orders Q
s = 12 # seasonal period in quarters; NOTE(review): the original comment said "4 quaerters" but the value is 12 (3 years) — for quarterly data 4 is the usual choice, confirm intent
parameters = product(ps, qs, Ps, Qs)
parameters_list = list(parameters)
len(parameters_list)  # 2*2*3*3 = 36 candidate (p, q, P, Q) tuples
# Grid-search SARIMA orders by AIC via the project helper.
# NOTE(review): `results` and `best_aic` appear unused in this block —
# presumably ts.optimizeSARIMA tracks the best AIC internally; confirm.
results = []
best_aic = float("inf")
# Give the index an explicit quarterly frequency for SARIMAX.
df.index.freq = 'Q'
%%time
result_table = ts.optimizeSARIMA(df,parameters_list, d, D, s)
result_table.head()
# set the parameters that give the lowest AIC
p, q, P, Q = result_table.parameters[0]
# Refit the winning specification on the full sample.
best_model=sm.tsa.statespace.SARIMAX(df, order=(p, d, q),
                                     seasonal_order=(P, D, Q, s)).fit(disp=-1)
print(best_model.summary())
# Visual check: the series differenced at a 12-quarter seasonal lag.
lag = 12
df_diff = df.gdp - df.gdp.shift(lag)
ts.tsplot(df_diff[lag + 1:],lags = 30)
# Residual diagnostics for the fitted model (skip the first season).
ts.tsplot(best_model.resid[s+1:], lags=30)
# Plot the SARIMA in-sample fit / forecast via the project helper.
ts.plotSARIMA(df, best_model, 12, 1, 8)
# Build a supervised-learning frame: target column y plus lagged copies.
data = pd.DataFrame(df.copy())
data.columns = ["y"]
# Lags 2..15; lag 1 is skipped — presumably to push the model beyond
# one-step persistence (TODO confirm intent).
for i in range(2, 16):
    data["lag_{}".format(i)] = data.y.shift(i)
data.tail(7)
# for time-series cross-validation set 3 folds
tscv = TimeSeriesSplit(n_splits=3)
y = data.dropna().y
X = data.dropna().drop(['y'], axis=1)
# reserve 30% of data for testing
X_train, X_test, y_train, y_test = ts.timeseries_train_test_split(X, y, test_size=0.3)
# Apply linear regression
lr = LinearRegression()
lr.fit(X_train, y_train)
ts.plotModelResults(lr,X_train,X_test,y_train, y_test, tscv, plot_intervals=True)
ts.plotCoefficients(lr,X_train)
# Add quarter-of-year dummies as calendar features.
data_Q = data.copy(deep=True)
data_Q['quarter'] = data_Q.index.quarter
data_Q.tail()
quarter_dummies = pd.get_dummies(data_Q.quarter,prefix='Q')
quarter_dummies.tail()
data_Q = pd.concat([data_Q, quarter_dummies],axis = 1)
data_Q.columns
# Drop Q_4 — presumably to avoid perfectly collinear dummies — along with
# the raw quarter column.
data_Q = data_Q.drop(labels = ['Q_4','quarter'],axis = 1) ## Drop unnecessary variables
data_Q.tail()
y = data_Q.dropna().y
X = data_Q.dropna().drop(['y'], axis=1)
# reserve 30% of data for testing
X_train, X_test, y_train, y_test = ts.timeseries_train_test_split(X, y, test_size=0.3)
lr = LinearRegression()
lr.fit(X_train, y_train)
ts.plotModelResults(lr,X_train,X_test,y_train, y_test, tscv, plot_intervals=True)
ts.plotCoefficients(lr,X_train)
# Inspect multicollinearity among the regressors.
X_train.corr()
plt.figure(figsize=(15, 8))
sns.heatmap(X_train.corr(), alpha = 0.6);
# Ridge regression with built-in CV over its default alpha grid, using the
# time-series splitter so folds respect temporal order.
ridge = RidgeCV(cv=tscv)
ridge.fit(X_train, y_train)
ts.plotModelResults(ridge,
                    X_train=X_train,
                    X_test=X_test,
                    y_train = y_train, y_test = y_test, tscv = tscv,
                    plot_intervals=True, plot_anomalies=True)
ts.plotCoefficients(ridge,X_train)
# Gradient-boosted trees on the plain lag features (no calendar dummies).
y = data.dropna().y
X = data.dropna().drop(['y'], axis=1)
X_train, X_test, y_train, y_test = ts.timeseries_train_test_split(X, y, test_size=0.3)
# Shallow stumps and few trees — presumably to limit overfitting on a
# small sample (TODO confirm).
xgb = XGBRegressor(max_depth=1, n_estimators = 10, learning_rate = 0.07)
xgb.fit(X_train, y_train);
# NOTE(review): a bare %timeit times nothing — was %%time intended?
%timeit
ts.plotModelResults(xgb,
                    X_train=X_train,
                    X_test=X_test,
                    y_train = y_train, y_test = y_test, tscv = tscv,
                    plot_intervals=True, plot_anomalies=True)
xgb
# Bar chart of feature importances.
fig , ax = plt.subplots(figsize = [15,8])
ax.bar(x=X_train.columns,height = xgb.feature_importances_, alpha = 0.4)
sns.despine()
# Top-5 features by importance (argsort is ascending; take the last five).
imp_features = X_train.columns[np.argsort(xgb.feature_importances_,)][-5:]
imp_features
# Refit a plain linear regression using only the top-5 XGBoost features.
y = data.dropna().y
X = data.dropna().drop(['y'], axis=1)
X = X[imp_features]
X_train, X_test, y_train, y_test = ts.timeseries_train_test_split(X, y, test_size=0.3)
lr = LinearRegression()
lr.fit(X_train, y_train)
ts.plotModelResults(lr,X_train,X_test,y_train, y_test, tscv, plot_intervals=True)
ts.plotCoefficients(lr,X_train)
# Switch target: first difference (quarter-over-quarter change) of GDP YoY.
gdp_roc = df.diff().dropna()
gdp_roc.head()
ts.tsplot(gdp_roc.values.squeeze(), lags=60)
# Same lag-feature construction as before, on the differenced series.
gdp_roc.columns = ["y"]
for i in range(2, 16):
    gdp_roc["lag_{}".format(i)] = gdp_roc.y.shift(i)
dt = gdp_roc.dropna()
dt[-100:]
# Keep only the most recent 100 observations.
dt = dt[-100:]
y = dt.y
X = dt.drop(['y'], axis = 1)
X_train, X_test, y_train, y_test = ts.timeseries_train_test_split(X, y, test_size=0.3)
xgb = XGBRegressor(max_depth=3,n_estimators = 3, learning_rate=0.1)
xgb.fit(X_train,y_train)
# NOTE(review): a bare %timeit times nothing — was %%time intended?
%timeit
ts.plotModelResults(xgb,
                    X_train=X_train,
                    X_test=X_test,
                    y_train = y_train, y_test = y_test, tscv = tscv,
                    plot_intervals=True, plot_anomalies=True)
# Feature importances for the differenced-target model.
fig , ax = plt.subplots(figsize = [15,8])
ax.bar(x=X_train.columns,height = xgb.feature_importances_, alpha = 0.4)
sns.despine()
# Refit a linear model on the top-5 boosted-tree features; X_train is
# narrowed in place, so X_test must be subset at the call site below.
imp_features = X_train.columns[np.argsort(xgb.feature_importances_,)][-5:]
X_train = X_train[imp_features]
lr = LinearRegression()
lr.fit(X_train, y_train)
ts.plotModelResults(lr,X_train,X_test[imp_features],y_train, y_test, tscv, plot_intervals=True)
ts.plotCoefficients(lr,X_train)
# Pairwise scatter of target vs lags over the most recent 100 observations.
sns.pairplot(gdp_roc.dropna()[-100:])
# Keep only the seasonal lags (4, 8, 12 quarters) as regressors.
dt = gdp_roc.dropna()[['y','lag_4','lag_8','lag_12']][-100:]
y = dt.y
X = dt.drop(['y'], axis = 1)
X_train, X_test, y_train, y_test = ts.timeseries_train_test_split(X, y, test_size=0.3)
lr = LinearRegression()
lr.fit(X_train, y_train)
ts.plotModelResults(lr,X_train,X_test,y_train, y_test, tscv, plot_intervals=True)
ts.plotCoefficients(lr,X_train)
# NOTE(review): mid-file import; convention is to group imports at the top.
from sklearn.linear_model import BayesianRidge
dt = gdp_roc.dropna()[-100:]
y = dt.y
X = dt.drop(['y'], axis = 1)
X_train, X_test, y_train, y_test = ts.timeseries_train_test_split(X, y, test_size=0.3)
# NOTE(review): the `normalize` keyword was deprecated in scikit-learn 1.0
# and removed in 1.2; on newer versions this line raises TypeError and the
# equivalent is a StandardScaler + BayesianRidge pipeline.
clf = BayesianRidge(normalize=True)
clf.fit(X_train,y_train)
ts.plotModelResults(clf,X_train,X_test,y_train, y_test, tscv, plot_intervals=True)
ts.plotCoefficients(clf,X_train)
# Lasso baseline on the full lag set of the differenced target.
dt = gdp_roc.dropna()[-100:]
y = dt.y
X = dt.drop(['y'], axis = 1)
X_train, X_test, y_train, y_test = ts.timeseries_train_test_split(X, y, test_size=0.3)
# Decompose the differenced target into trend / seasonal / residual.
results = seasonal_decompose(dt['y'])
fig = results.plot()
fig.set_size_inches(10,6)
sns.despine()
# Lasso with the default alpha=1.0; may shrink many coefficients to zero.
reg = Lasso()
reg.fit(X_train,y_train)
ts.plotModelResults(reg,X_train,X_test,y_train, y_test, tscv, plot_intervals=True)
ts.plotCoefficients(reg,X_train)